继续在第117~170行之间添加DEBUG_TEXT,将出错范围缩小到第133~165行(第133行的打印有输出,第165行的打印没有输出)。
133 DEBUG_TEXT(DFDB_ROSCORE, 8, "FragmentRequest::execute calling dc->getFragment for L1Id "<< m_level1Id<< std::endl);//已被打印 134 for (std::vector<DataChannel*>::iterator dc=chanStartIter; dc!=chanEndIter; dc++) { 135 DEBUG_TEXT(DFDB_ROSCORE, 20, "FragmentRequest::execute calling dc->getFragment for L1Id "<< m_level1Id<< std::endl); 136 EventFragment* subFragment = ((*dc)->getFragment(m_ticket[index])) ; 137 if (subFragment != 0) { 138 partsReceived++; 139 DEBUG_TEXT(DFDB_ROSCORE, 20, "FragmentRequest::execute calling builder->appendFragment for L1Id "<< m_level1Id<< std::endl ); 140 m_builder->appendFragment(m_eventFragment,subFragment); 141 TS_RECORD(TS_H1,2350); 142 143 fragmentOk=subFragment->fragmentReady(); 144 145 s_mutex->lock() ; 146 delete subFragment; 147 s_mutex->unlock() ; 148 } 149 else { 150 DEBUG_TEXT(DFDB_ROSCORE, 8, "FragmentRequest for L1Id "<< m_level1Id << " missing data aborting "); 151 fragmentOk=false; 152 TS_RECORD(TS_H1,2360); 153 } 154 155 index++; 156 } 157 158 159 bool retired=false; 160 if (!fragmentOk) { 161 retired=checkAge(s_maxAge); 162 } 163 164 165 DEBUG_TEXT(DFDB_ROSCORE, 8, "FragmentRequest::execute L1Id:"<< m_level1Id<< ",fragmentOk="<< fragmentOk<<",retired="<<retired<<" ,partsReceived="<<partsReceived<< std::endl);//未被打印
先在log文件中找到requestOk:-1,确定是L1Id为196635的事例出现问题。
[lhaaso@cmm03node01 part_dk_ef]$ grep "FragmentRequest::execute calling dc->getFragment for L1Id 196635" ROS-Eth-00_cmm03node01_1487253328.out Debug(13,140405022635776): FragmentRequest::execute calling dc->getFragment for L1Id 196635 [lhaaso@cmm03node01 part_dk_ef]$ grep "Debug(13,140405022635776): FragmentRequest::execute L1Id:196635" ROS-Eth-00_cmm03node01_1487253328.out [lhaaso@cmm03node01 part_dk_ef]$
继续添加打印以缩小范围,定位到问题出现在第136行:
136 EventFragment* subFragment = ((*dc)->getFragment(m_ticket[index])) ;
PCMemoryDataChannel.cpp 里的PCMemoryDataChannel::getFragment()第200行有打印出来,猜测错误应该是出现在try块里。继续缩小范围,定位到第203行出现问题。
198 if((intptr_t)ed == EventInputManager::EIM_MAYCOME || (intptr_t)ed == EventInputManager::EIM_NEVERTOCOME) 199 { 200 DEBUG_TEXT(DFDB_ROSFM, 8, "EIM_MAYCOME || EIM_NEVERTOCOME " << ticket << std::endl); 201 try 202 { 203 fragment = new ROBFragment(m_memoryPool, ticket, m_sourceIdentifier, 0); //create an empty ROB fragment //出现问题的代码 204 Buffer *mem_buffer = fragment->buffer(); //get the Buffer of the ROB fragment 205 Buffer::page_iterator mem_page_i = mem_buffer->begin(); 206 MemoryPage *mem_page = const_cast<MemoryPage *>(*mem_page_i); //get the memory page of the buffer 207 mem_page->lock(); 208 209 evDesc_t *ed = m_eventInputManager->getEventDescriptor(mem_page); //get a pointer to the event descriptor 210 ed->L1id = ticket; //set the L1ID 211 m_eventInputManager->createEvent(ed); //Insert the event into the Event Input Manager 212 213 if ((intptr_t)ed == EventInputManager::EIM_MAYCOME) 214 { 215 m_statistics->fragmentsMissed++; 216 fragment->setStatus(EventFragment::STATUS_MAYCOME); 217 DEBUG_TEXT(DFDB_ROSFM, 10, "PCMemoryDataChannel::getFragment: Fragment for L1ID " << ticket << " has not yet arrived"); 218 } 219 220 if ((intptr_t)ed == EventInputManager::EIM_NEVERTOCOME) 221 { 222 m_statistics->fragmentsLost++; 223 fragment->setStatus(EventFragment::STATUS_LOST); 224 DEBUG_TEXT(DFDB_ROSFM, 10, "PCMemoryDataChannel::getFragment: Fragment for L1ID " << ticket << " does not exist"); 225 CREATE_ROS_EXCEPTION(ex1, CoreException, PCMEMCHAN_LOST, " L1ID = " << ticket << ", ROL physical addr = " << physicalAddress()); 226 ers::warning(ex1); 227 }
查看ROBFragment.cpp中ROBFragment的构造函数如下:
198 /********************************************************************************************/ 199 ROBFragment::ROBFragment(MemoryPool* mempool, u_int level1Id, u_int sourceId, u_int runNumber) 200 /********************************************************************************************/ 201 { 202 DEBUG_TEXT(DFDB_ROSEF, 8 , "Lost event " << level1Id << " begin to created"); //已被打印 203 // This constructor is for the (hopefully) rare case that a ROD fragment 204 // does not get delivered by the ROL and the FragmentManager has to 205 // return an empty ROB fragment 206 207 m_buffer = new Buffer(mempool); //出现问题代码行 208 209 // Build the ROB header 210 DEBUG_TEXT(DFDB_ROSEF, 8, "calling initialiseHeader "<<level1Id << std::endl); //未被打印 211 initialiseHeader(sourceId, STATUS_TIMEOUT); 212 213 DEBUG_TEXT(DFDB_ROSEF, 8, "ROBFragment::ROBFragment(lost): s_formatVersionNumber for ROD header is " << s_formatVersionNumber << " " << level1Id << std::endl); 214 // Build the ROD header 215 m_rodheader = new(m_buffer) RODFragment::RODHeader; 216 DEBUG_TEXT(DFDB_ROSEF, 8, "m_rodheader is at " << m_rodheader << " " << level1Id << std::endl); 217 m_rodheader->startOfHeaderMarker = s_rodMarker; 218 m_rodheader->headerSize = sizeof(RODFragment::RODHeader) / sizeof (u_int); 219 m_rodheader->formatVersionNumber = s_rodformatVersionNumber; 220 //The source ID of the ROD header should not be identical to that of the ROB header. As we don't know 221 //it (without additional tricks in the FM) I duplicate it anyway. 
FIXME 222 m_rodheader->sourceIdentifier = sourceId & 0xffffff; 223 m_rodheader->level1Id = level1Id; 224 m_rodheader->bunchCrossingId = 0; 225 m_rodheader->level1TriggerType = 0; 226 m_rodheader->detectorEventType = 0; 227 m_rodheader->runNumber = runNumber; 228 229 // Build the ROD body (just one status word) 230 m_rodbody = new(m_buffer) u_int[1]; 231 *m_rodbody = STATUS_TIMEOUT; // Error status 232 DEBUG_TEXT(DFDB_ROSEF, 8, "m_rodbody is at " << m_rodbody << " " << level1Id << std::endl); 233 234 // Build the ROD trailer 235 m_rodtrailer = new(m_buffer) RODFragment::RODTrailer; 236 DEBUG_TEXT(DFDB_ROSEF, 8, "m_rodtrailer is at " << m_rodtrailer << " " << level1Id << std::endl); 237 m_rodtrailer->numberOfStatusElements = 1; 238 m_rodtrailer->numberOfDataElements = 0; 239 m_rodtrailer->statusBlockPosition = 0; 240 241 // Generic ROB header 242 int rodsize = RODFragment::s_rodheaderSize + 1 + RODFragment::s_rodtrailerSize; 243 m_header->generic.totalFragmentsize = s_robheaderSize + rodsize; 244 245 //No ROB trailer. crc_flag is 0 246 247 m_rodFragmentExists = 1; 248 DEBUG_TEXT(DFDB_ROSEF, 8 , "Lost event " << level1Id << " created"); 249 }
查看../../ROSBufferManagement/src/Buffer.cpp
121 Buffer::Buffer(MemoryPool *memoryPool) 122 : m_memoryPool(memoryPool), 123 m_size(0), 124 m_lastMemoryPage(m_memoryPool->getPage()), 125 m_pageSize(m_memoryPool->pageSize()), 126 m_numberOfPages(1), 127 m_current(0), 128 m_reserved(0) 129 { 130 m_pages[0]=m_lastMemoryPage; 131 }
../../ROSMemoryPool/ROSMemoryPool/MemoryPool.h
121 inline MemoryPage * MemoryPool::getPage() 122 { 123 if (m_freeIndex >= m_noPages) 124 throw MemoryPoolException(MemoryPoolException::NOPAGESAVAILABLE); 125 126 MemoryPage *rc = (*m_pageVector)[m_freeIndex]; 127 (*m_pageVector)[m_freeIndex] = 0; 128 m_freeIndex++; 129 return rc; 130 }
出现问题的原因:
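FragmentRequest::execute()执行时,getFragment没有找到对应L1id的ROBFragment,于是要构造一个空的ROBFragment;其构造函数中的new Buffer(mempool)会在Buffer的初始化列表里调用MemoryPool::getPage(),错误就出现在getPage这一步。从getPage的代码看,唯一显式抛出异常的情况是m_freeIndex >= m_noPages,即内存池中已经没有空闲页(MemoryPoolException::NOPAGESAVAILABLE)。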
(批注:为什么会出现找不到ROB的情况呢?按照设计,ROS只有在数据到齐的情况下才会向L2SV发送消息,消息再传递到SFI,之后SFI才会向ROS请求对应L1id的数据;因此在这个时候缺少ROB,从逻辑上说是不应该发生的。所以怀疑检查数据完整性的逻辑可能有问题。)